Jan-Philipp Kolb
9. Mai 2017
.xlsx, .csv, .dta oder ähnliches abgespeichert, sondern in einem der folgenden Formate: .json, .xml etc. Die Struktur der Daten kann man sich mit einem JSON-Viewer anschauen.
jsonlite
install.packages("jsonlite")
library(jsonlite)
citation("jsonlite")##
## To cite jsonlite in publications use:
##
## Jeroen Ooms (2014). The jsonlite Package: A Practical and
## Consistent Mapping Between JSON Data and R Objects.
## arXiv:1403.2805 [stat.CO] URL http://arxiv.org/abs/1403.2805.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {The jsonlite Package: A Practical and Consistent Mapping Between JSON Data and R Objects},
## author = {Jeroen Ooms},
## journal = {arXiv:1403.2805 [stat.CO]},
## year = {2014},
## url = {http://arxiv.org/abs/1403.2805},
## }
library("jsonlite")
DRINKWATER <- fromJSON("data/RomDrinkingWater.geojson")
names(DRINKWATER)[1:3]
## [1] "type"      "generator" "copyright"
names(DRINKWATER)[4:5]
## [1] "timestamp" "features"
head(DRINKWATER$features)
## type id properties.@id properties.amenity properties.flow
## 1 Feature node/246574149 node/246574149 drinking_water push-button
## 2 Feature node/246574150 node/246574150 drinking_water <NA>
## 3 Feature node/246574151 node/246574151 drinking_water <NA>
## 4 Feature node/248743324 node/248743324 drinking_water <NA>
## 5 Feature node/251773348 node/251773348 drinking_water <NA>
## 6 Feature node/251773551 node/251773551 drinking_water <NA>
## properties.type properties.name properties.name:fr properties.wheelchair
## 1 nasone <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA>
## 5 nasone <NA> <NA> <NA>
## 6 <NA> Acqua Marcia Eau potable yes
## properties.created_by properties.indoor geometry.type
## 1 <NA> <NA> Point
## 2 <NA> <NA> Point
## 3 <NA> <NA> Point
## 4 <NA> <NA> Point
## 5 <NA> <NA> Point
## 6 <NA> <NA> Point
## geometry.coordinates
## 1 12.49191, 41.89479
## 2 12.49095, 41.89489
## 3 12.48774, 41.89450
## 4 12.48773, 41.89354
## 5 12.48529, 41.88539
## 6 12.48386, 41.89332
my_repos <- fromJSON("https://api.github.com/users/japhilko/repos")
names(my_repos)
## [1] "id" "name" "full_name"
## [4] "owner" "private" "html_url"
## [7] "description" "fork" "url"
## [10] "forks_url" "keys_url" "collaborators_url"
## [13] "teams_url" "hooks_url" "issue_events_url"
## [16] "events_url" "assignees_url" "branches_url"
## [19] "tags_url" "blobs_url" "git_tags_url"
## [22] "git_refs_url" "trees_url" "statuses_url"
## [25] "languages_url" "stargazers_url" "contributors_url"
## [28] "subscribers_url" "subscription_url" "commits_url"
## [31] "git_commits_url" "comments_url" "issue_comment_url"
## [34] "contents_url" "compare_url" "merges_url"
## [37] "archive_url" "downloads_url" "issues_url"
## [40] "pulls_url" "milestones_url" "notifications_url"
## [43] "labels_url" "releases_url" "deployments_url"
## [46] "created_at" "updated_at" "pushed_at"
## [49] "git_url" "ssh_url" "clone_url"
## [52] "svn_url" "homepage" "size"
## [55] "stargazers_count" "watchers_count" "language"
## [58] "has_issues" "has_projects" "has_downloads"
## [61] "has_wiki" "has_pages" "forks_count"
## [64] "mirror_url" "open_issues_count" "forks"
## [67] "open_issues" "watchers" "default_branch"
library(jsonlite)
res <- fromJSON('http://ergast.com/api/f1/2004/1/results.json')
drivers <- res$MRData$RaceTable$Races$Results[[1]]$Driver
colnames(drivers)
## [1] "driverId" "code" "url" "givenName"
## [5] "familyName" "dateOfBirth" "nationality" "permanentNumber"
article_key <- "&api-key=c2fede7bd9aea57c898f538e5ec0a1ee:6:68700045"
url <- "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=obamacare+socialism"
req <- fromJSON(paste0(url, article_key))
articles <- req$response$docs
colnames(articles)
## [1] "web_url" "snippet" "lead_paragraph"
## [4] "abstract" "print_page" "blog"
## [7] "source" "multimedia" "headline"
## [10] "keywords" "pub_date" "document_type"
## [13] "news_desk" "section_name" "subsection_name"
## [16] "byline" "type_of_material" "_id"
## [19] "word_count" "slideshow_credits"
XML-Paket
library(XML)
citation("XML")##
## To cite package 'XML' in publications use:
##
## Duncan Temple Lang and the CRAN Team (2016). XML: Tools for
## Parsing and Generating XML Within R and S-Plus. R package
## version 3.98-1.5. https://CRAN.R-project.org/package=XML
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {XML: Tools for Parsing and Generating XML Within R and S-Plus},
## author = {Duncan Temple Lang and the CRAN Team},
## year = {2016},
## note = {R package version 3.98-1.5},
## url = {https://CRAN.R-project.org/package=XML},
## }
##
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
xml2-Paket
install.packages("xml2")
library(xml2)
citation("xml2")##
## To cite package 'xml2' in publications use:
##
## Hadley Wickham and James Hester (2016). xml2: Parse XML. R
## package version 1.0.0. https://CRAN.R-project.org/package=xml2
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {xml2: Parse XML},
## author = {Hadley Wickham and James Hester},
## year = {2016},
## note = {R package version 1.0.0},
## url = {https://CRAN.R-project.org/package=xml2},
## }
url <- "http://api.openstreetmap.org/api/0.6/relation/62422"
library(XML)  # xmlParse() kommt aus dem XML-Paket, nicht aus xml2
BE <- xmlParse(url)
Administrative Grenzen Berlin
xmltop = xmlRoot(BE)
class(xmltop)
## [1] "XMLInternalElementNode" "XMLInternalNode"
## [3] "XMLAbstractNode"
xmlSize(xmltop)
## [1] 1
xmlSize(xmltop[[1]])
## [1] 328
Xpath, the XML Path Language, is a query language for selecting nodes from an XML document.
xpathApply(BE,"//tag[@k = 'source:population']")
## [[1]]
## <tag k="source:population" v="http://www.statistik-berlin-brandenburg.de/Publikationen/Stat_Berichte/2010/SB_A1-1_A2-4_q01-10_BE.pdf 2010-10-01"/>
##
## attr(,"class")
## [1] "XMLNodeSet"
url2 <- "http://api.openstreetmap.org/api/0.6/node/2923760808"
RennesBa <- xmlParse(url2)
url3 <- "http://api.openstreetmap.org/api/0.6/way/72799743"
MadCalle <- xmlParse(url3)
Logo Overpass API
The Overpass API is a read-only API that serves up custom selected parts of the OSM map data.
Spielplätze Mannheim
Export Rohdaten
Link1 <- "http://www.overpass-api.de/api/interpreter?data=[maxsize:1073741824][timeout:900];area[name=\""
library(XML)
place <- "Mannheim"
type_obj <- "node"
object <- "leisure=playground"
InfoList <- xmlParse(paste(Link1, place, "\"];",
  type_obj, "(area)[", object, "];out;", sep = ""))
Spielplätze in Mannheim
Die Liste der ID’s mit dem Wert playground:
node_id <- xpathApply(InfoList,
  "//tag[@v= 'playground']/parent::node/@id")
## node_id[[1]]
Erste node id
lat_x <- xpathApply(InfoList,
  "//tag[@v= 'playground']/parent::node/@lat")
# lat_x[[1]]; lat_x[[2]]
lon_x <- xpathApply(InfoList,
  "//tag[@v= 'playground']/parent::node/@lon")
Longitude-Koordinate
library(devtools)
install_github("Japhilko/gosmd")
library(gosmd)
## Loading required package: maptools
## Loading required package: sp
## Checking rgeos availability: TRUE
## Loading required package: RJSONIO
##
## Attaching package: 'RJSONIO'
## The following objects are masked from 'package:jsonlite':
##
## fromJSON, toJSON
## Loading required package: stringr
pg_MA <- get_osm_nodes(object="leisure=playground",
"Mannheim")
info <- extract_osm_nodes(OSM.Data = pg_MA,
  value = "playground")
|  | leisure | lat | lon | note |
|---|---|---|---|---|
| 30560755 | playground | 49.51910 | 8.502807 | NA |
| 76468450 | playground | 49.49633 | 8.539396 | Rutsche, Schaukel, großer Sandkasten, Tischtennis |
| 76468534 | playground | 49.49678 | 8.552959 | NA |
| 76468535 | playground | 49.49230 | 8.548750 | NA |
| 76468536 | playground | 49.50243 | 8.548140 | Schaukel, Rutsche, Sandkasten, Spielhäuser, Tischtennis |
| 76468558 | playground | 49.49759 | 8.542036 | NA |
http://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf
http://www.di.fc.ul.pt/~jpn/r/web/index.html#parsing-xml
http://www.w3schools.com/xml/xquery_intro.asp
http://giventhedata.blogspot.de/2012/06/r-and-web-for-beginners-part-ii-xml-in.html
http://gastonsanchez.com/Handling_and_Processing_Strings_in_R.pdf
XML - Gaston Sanchez
library("XML")
Gaston Sanchez - Dataflow
Seine Arbeit sieht man hier.
Gaston Sanchez - Webdaten bekommen
| Function | Description |
|---|---|
| xmlName() | name of the node |
| xmlSize() | number of subnodes |
| xmlAttrs() | named character vector of all attributes |
| xmlGetAttr() | value of a single attribute |
| xmlValue() | contents of a leaf node |
| xmlParent() | name of parent node |
| xmlAncestors() | name of ancestor nodes |
| getSibling() | siblings to the right or to the left |
| xmlNamespace() | the namespace (if there’s one) |
Administrative Grenzen für Deutschland
url <- "http://api.openstreetmap.org/api/0.6/relation/62422"
BE <- xmlParse(url)
Administrative Grenzen Berlin
xmltop = xmlRoot(BE)
class(xmltop)
## [1] "XMLInternalElementNode" "XMLInternalNode"
## [3] "XMLAbstractNode"
xmlSize(xmltop)
## [1] 1
xmlSize(xmltop[[1]])
## [1] 328
Xpath, the XML Path Language, is a query language for selecting nodes from an XML document.
xpathApply(BE,"//tag[@k = 'population']")## [[1]]
## <tag k="population" v="3440441"/>
##
## attr(,"class")
## [1] "XMLNodeSet"
xpathApply(BE,"//tag[@k = 'source:population']")## [[1]]
## <tag k="source:population" v="http://www.statistik-berlin-brandenburg.de/Publikationen/Stat_Berichte/2010/SB_A1-1_A2-4_q01-10_BE.pdf 2010-10-01"/>
##
## attr(,"class")
## [1] "XMLNodeSet"
xpathApply(BE,"//tag[@k = 'name:ta']")## [[1]]
## <tag k="name:ta" v="<U+0BAA><U+0BC6><U+0BB0><U+0BCD><U+0BB2><U+0BBF><U+0BA9><U+0BCD>"/>
##
## attr(,"class")
## [1] "XMLNodeSet"
region <- xpathApply(BE,
"//tag[@k = 'geographical_region']")
# regular expressions
region[[1]]## <tag k="geographical_region" v="Barnim;Berliner Urstromtal;Teltow;Nauener Platte"/>
<tag k="geographical_region"
v="Barnim;Berliner Urstromtal;
Teltow;Nauener Platte"/>
Barnim
url2<-"http://api.openstreetmap.org/api/0.6/node/25113879"
obj2<-xmlParse(url2)
obj_amenity<-xpathApply(obj2,"//tag[@k = 'amenity']")[[1]]
obj_amenity## <tag k="amenity" v="university"/>
xpathApply(obj2,"//tag[@k = 'wikipedia']")[[1]]## <tag k="wikipedia" v="de:Universität Mannheim"/>
xpathApply(obj2,"//tag[@k = 'wheelchair']")[[1]]
xpathApply(obj2,"//tag[@k = 'name']")[[1]]
url3 <- "http://api.openstreetmap.org/api/0.6/node/303550876"
obj3 <- xmlParse(url3)
xpathApply(obj3,"//tag[@k = 'opening_hours']")[[1]]## <tag k="opening_hours" v="Mo-Sa 09:00-20:00; Su,PH off"/>
url4<-"http://api.openstreetmap.org/api/0.6/node/25439439"
obj4 <- xmlParse(url4)
xpathApply(obj4,"//tag[@k = 'railway:station_category']")[[1]]## <tag k="railway:station_category" v="2"/>
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
bhfkat <- read_html(
  "https://de.wikipedia.org/wiki/Bahnhofskategorie")
df_html_bhfkat <- html_table(
  html_nodes(bhfkat, "table")[[1]], fill = TRUE)
| Stufe | Bahnsteigkanten | Bahnsteiglänge | Reisende/Tag | Zughalte/Tag |
|---|---|---|---|---|
| 6 | 01 | > 000 bis 090 m | 00000 bis 00049 | 000 bis 0010 |
| 5 | 02 | > 090 bis 140 m | 00050 bis 00299 | 011 bis 0050 |
| 4 | 03 bis 04 | > 140 bis 170 m | 00300 bis 00999 | 051 bis 0100 |
| 3 | 05 bis 09 | > 170 bis 210 m | 01000 bis 09999 | 101 bis 0500 |
| 2 | 10 bis 14 | > 210 bis 280 m | 10.000 bis 49.999 | 501 bis 1000 |
| 1 | 00i ab 15 | > 280 m | 00000i ab 50.000 | 000i ab 1001 |
url5<-"http://api.openstreetmap.org/api/0.6/way/162149882"
obj5<-xmlParse(url5)
xpathApply(obj5,"//tag[@k = 'name']")[[1]]## <tag k="name" v="City-Airport Mannheim"/>
xpathApply(obj5,"//tag[@k = 'website']")[[1]]## <tag k="website" v="http://www.flugplatz-mannheim.de/"/>
xpathApply(obj5,"//tag[@k = 'iata']")[[1]]## <tag k="iata" v="MHG"/>
Deborah Nolan - Extracting data from XML
Duncan Temple Lang - A Short Introduction to the XML package for R
Noch mehr Informationen
rvest
library(rvest)
ht <- read_html('https://www.google.co.in/search?q=guitar+repair+workshop')
links <- ht %>% html_nodes(xpath='//h3/a') %>% html_attr('href')
gsub('/url\\?q=','',sapply(strsplit(links[as.vector(grep('url',links))],split='&'),'[',1))
## [1] "http://theguitarrepairworkshop.com/"
## [2] "http://www.guitarservices.com/"
## [3] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-project.html"
## [4] "https://www.facebook.com/The-Guitar-Repair-Workshop-847517635259712/"
## [5] "https://www.taylorguitars.com/dealer/guitar-repair-workshop-ltd"
## [6] "http://www.laweekly.com/music/10-best-guitar-repair-shops-in-los-angeles-4647166"
## [7] "https://www.justdial.com/Mumbai/Guitar-Repair-Services/nct-10988623"
## [8] "https://www.justdial.com/Delhi-NCR/Guitar-Repair-Services/nct-10988623"
## [9] "http://guitarworkshopglasgow.com/pages/repairs-1"
## [10] "http://www.google.co.in/aclk?sa=l"
install.packages("tidyverse")
library(tidyverse)
library(stringr)
library(forcats)
library(ggmap)
library(rvest)
html.world_ports <- read_html("https://en.wikipedia.org/wiki/List_of_busiest_container_ports")
df.world_ports <- html_table(html_nodes(html.world_ports, "table")[[2]], fill = TRUE)
glimpse(df.world_ports)
## Observations: 50
## Variables: 15
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ Port <chr> "Shanghai", "Singapore", "Shenzhen", "Ningbo-Zhoushan...
## $ Economy <chr> "China", "Singapore", "China", "China", "Hong Kong", ...
## $ 2015[1] <chr> "36,516", "30,922", "24,142", "20,636", "20,073", "19...
## $ 2014[2] <chr> "35,268", "33,869", "23,798", "19,450", "22,374", "18...
## $ 2013[3] <chr> "33,617", "32,240", "23,280", "17,351", "22,352", "17...
## $ 2012[4] <chr> "32,529", "31,649", "22,940", "16,670", "23,117", "17...
## $ 2011[5] <chr> "31,700", "29,937", "22,570", "14,686", "24,384", "16...
## $ 2010[6] <chr> "29,069", "28,431", "22,510", "13,144", "23,532", "14...
## $ 2009[7] <chr> "25,002", "25,866", "18,250", "10,502", "20,983", "11...
## $ 2008[8] <chr> "27,980", "29,918", "21,414", "11,226", "24,248", "13...
## $ 2007[9] <chr> "26,150", "27,932", "21,099", "9,349", "23,881", "13,...
## $ 2006[10] <chr> "21,710", "24,792", "18,469", "7,068", "23,539", "12,...
## $ 2005[11] <chr> "18,084", "23,192", "16,197", "5,208", "22,427", "11,...
## $ 2004[12] <chr> "14,557", "21,329", "13,615", "4,006", "21,984", "11,...
rvest
library(rvest)
ht <- read_html('https://www.google.co.in/search?q=guitar+repair+workshop')
links <- ht %>% html_nodes(xpath='//h3/a') %>% html_attr('href')
gsub('/url\\?q=','',sapply(strsplit(links[as.vector(grep('url',links))],split='&'),'[',1))
## [1] "http://theguitarrepairworkshop.com/"
## [2] "http://www.guitarservices.com/"
## [3] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-project.html"
## [4] "https://www.facebook.com/The-Guitar-Repair-Workshop-847517635259712/"
## [5] "https://www.taylorguitars.com/dealer/guitar-repair-workshop-ltd"
## [6] "http://www.laweekly.com/music/10-best-guitar-repair-shops-in-los-angeles-4647166"
## [7] "https://www.justdial.com/Mumbai/Guitar-Repair-Services/nct-10988623"
## [8] "https://www.justdial.com/Delhi-NCR/Guitar-Repair-Services/nct-10988623"
## [9] "http://guitarworkshopglasgow.com/pages/repairs-1"
## [10] "http://www.google.co.in/aclk?sa=l"
Im Folgenden werde ich zeigen, wie man Textinformationen aus Wikipedia herunterladen, verarbeiten und analysieren kann.
install.packages("NLP")
install.packages("tm")
install.packages("FactoMineR")
stringi von Marek Gagolewski und Bartek Tartanus bietet Möglichkeiten zur String-Verarbeitung.
library("stringi")
tm ist ein R-Paket, um Text Mining zu realisieren. Es wurde von Ingo Feinerer, Kurt Hornik und David Meyer geschrieben.
library("tm")
Das FactoMineR-Paket wurde von Sébastien Lê, Julie Josse und François Husson zur Durchführung der Hauptkomponentenanalyse erstellt.
library("FactoMineR")
wiki <- "http://de.wikipedia.org/wiki/"
titles <- c("Zika-Virus", "Influenza-A-Virus_H1N1",
"Spanische_Grippe","Influenzavirus",
"Vogelgrippe_H5N1",
"Legionellose-Ausbruch_in_Warstein_2013",
"Legionellose-Ausbruch_in_Jülich_2014")
articles <- character(length(titles))
# Download the raw HTML of every Wikipedia article in `titles` into the
# pre-allocated `articles` vector, flattening each page into one string.
# seq_along() is safer than 1:length(titles): for an empty `titles` it
# iterates zero times instead of over c(1, 0).
for (i in seq_along(titles)) {
  articles[i] <- stri_flatten(
    readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
Das Folgende basiert auf einem Blogpost von Norbert Ryciak über die automatische Kategorisierung von Wikipedia-Artikeln.
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(
x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(
x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
# Collapse runs of whitespace into a single blank.
docs5 <- tm_map(docs4, stripWhitespace)
# Drop German stop words ("und", "der", "die", ...).
docs6 <- tm_map(docs5, removeWords, stopwords("german"))
# Remove punctuation characters.
docs7 <- tm_map(docs6, removePunctuation)
# Normalise all text to lower case.
docs8 <- tm_map(docs7, tolower)
# docs8 <- tm_map(docs8, PlainTextDocument)
dtm <- DocumentTermMatrix(docs8)
dtm2 <- as.matrix(dtm)
# Overall term frequencies across all documents, most frequent first.
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
# Keep only terms that occur more than 20 times in total.
words <- frequency[frequency > 20]
# Document-term counts restricted to the frequent terms, transposed so
# that rows are terms and columns are documents.  This replaces the
# original cbind-in-a-loop construction, which copied `s` on every
# iteration (O(n^2)) and broke for a single-row dtm2, where 2:nrow(dtm2)
# evaluates to c(2, 1).
s <- t(dtm2[, colnames(dtm2) %in% names(words), drop = FALSE])
colnames(s) <- titles
PCA(s)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 125 individuals, described by 7 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# Scale each row (term) by its standard deviation so that very frequent
# terms do not dominate the distance computation.
s0 <- s / apply(s, 1, sd)
# Hierarchical Ward clustering of the documents.  "ward.D" is the current
# name for the method that was called "ward" before R 3.1.0; the old name
# is deprecated and triggers a message.
h <- hclust(dist(t(s0)), method = "ward.D")
plot(h, labels = titles, sub = "")